The method: Network Analysis

00 Preparing data

0.1 Loading packages

library(tidyverse) # coding with tidy style
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate) # processing time
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(igraph) # plotting network
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Avoid converting character to factor silently.
# Use the literal FALSE, not F: F is an ordinary variable that can be
# reassigned, so relying on it is a known footgun.
# (Since R 4.0 this is the default, but being explicit keeps the script
# reproducible on older R versions.)
options(stringsAsFactors = FALSE)

0.2 Loading data

# Load the Twitter information-operations takedown data (August 2019,
# China set 1): one table of suspended accounts, one table of their tweets.
users_1 <- read_csv("data/china_082019_1_users_csv_hashed.csv")
## Parsed with column specification:
## cols(
##   userid = col_character(),
##   user_display_name = col_character(),
##   user_screen_name = col_character(),
##   user_reported_location = col_character(),
##   user_profile_description = col_character(),
##   user_profile_url = col_character(),
##   follower_count = col_double(),
##   following_count = col_double(),
##   account_creation_date = col_date(format = ""),
##   account_language = col_character()
## )
tweets_1 <- read_csv("data/china_082019_1_tweets_csv_hashed.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   tweetid = col_double(),
##   userid = col_double(),
##   user_profile_url = col_logical(),
##   follower_count = col_double(),
##   following_count = col_double(),
##   account_creation_date = col_date(format = ""),
##   tweet_time = col_datetime(format = ""),
##   in_reply_to_userid = col_double(),
##   in_reply_to_tweetid = col_double(),
##   quoted_tweet_tweetid = col_double(),
##   is_retweet = col_logical(),
##   retweet_userid = col_logical(),
##   retweet_tweetid = col_logical(),
##   quote_count = col_double(),
##   reply_count = col_double(),
##   like_count = col_double(),
##   retweet_count = col_double(),
##   poll_choices = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 951646 parsing failures.
##  row              col           expected                                    actual                                        file
## 1003 retweet_userid   1/0/T/F/TRUE/FALSE 2896172507                                'data/china_082019_1_tweets_csv_hashed.csv'
## 1003 retweet_tweetid  1/0/T/F/TRUE/FALSE 581033920591859712                        'data/china_082019_1_tweets_csv_hashed.csv'
## 1052 userid           a double           nXJwGmcG71Ho3srqnXTSnyEokZXke8tkdISrrycI= 'data/china_082019_1_tweets_csv_hashed.csv'
## 1052 user_profile_url 1/0/T/F/TRUE/FALSE http://t.co/x5Ctw87c                      'data/china_082019_1_tweets_csv_hashed.csv'
## 1076 retweet_userid   1/0/T/F/TRUE/FALSE 2896172507                                'data/china_082019_1_tweets_csv_hashed.csv'
## .... ................ .................. ......................................... ...........................................
## See problems(...) for more details.
# NOTE(review): the 951,646 parsing failures above come from readr guessing
# column types on the first rows only — userid is guessed as double but
# later rows hold hashed string ids, and retweet_userid / retweet_tweetid /
# user_profile_url are guessed as logical but contain ids/URLs. Those
# values become NA. This analysis only uses user_screen_name, tweet_text
# and tweet_time (all parsed correctly), so results are unaffected, but if
# retweet_* columns are ever needed, pass an explicit
# col_types = cols(.default = col_character(), ...) — TODO confirm.
# users_2 <- read_csv("data/china_082019_2_users_csv_hashed.csv")
# tweets_2 <- read_csv("data/china_082019_2_tweets_csv_hashed.csv")

0.3 Previewing data

# Quick structural overview of the tweets table: ~1.9M rows, 31 columns.
# Note retweet_userid / retweet_tweetid show as <lgl> all-NA here — a
# consequence of the type-guessing failures reported at load time — which
# is why RT relations are later extracted from tweet_text instead.
tweets_1 %>% glimpse()
## Observations: 1,906,831
## Variables: 31
## $ tweetid                  <dbl> 4.363064e+17, 5.826830e+17, 5.829979e+1…
## $ userid                   <dbl> 206027550, 206027550, 206027550, 206027…
## $ user_display_name        <chr> "The Flecha", "The Flecha", "The Flecha…
## $ user_screen_name         <chr> "nessniven", "nessniven", "nessniven", …
## $ user_reported_location   <chr> "Republic of Korea", "Republic of Korea…
## $ user_profile_description <chr> "When you see Jair Bolsonaro and his pr…
## $ user_profile_url         <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ follower_count           <dbl> 100847, 100847, 100847, 100847, 100847,…
## $ following_count          <dbl> 31752, 31752, 31752, 31752, 31752, 3175…
## $ account_creation_date    <date> 2010-10-22, 2010-10-22, 2010-10-22, 20…
## $ account_language         <chr> "en", "en", "en", "en", "en", "en", "en…
## $ tweet_language           <chr> "pt", "pt", "pt", "pt", NA, "pt", "pt",…
## $ tweet_text               <chr> "segui o @instagranzin e ganhei 89 segu…
## $ tweet_time               <dttm> 2014-02-20 01:08:00, 2015-03-30 23:17:…
## $ tweet_client_name        <chr> "TweetDeck", "TweetDeck", "TweetDeck", …
## $ in_reply_to_userid       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ in_reply_to_tweetid      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ quoted_tweet_tweetid     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ is_retweet               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ retweet_userid           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ retweet_tweetid          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ latitude                 <chr> "absent", "absent", "absent", "absent",…
## $ longitude                <chr> "absent", "absent", "absent", "absent",…
## $ quote_count              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, …
## $ reply_count              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 18, 1, 1, 1, 1,…
## $ like_count               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ retweet_count            <dbl> 0, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ hashtags                 <chr> NA, "[]", "[]", "[]", NA, "[]", "[]", "…
## $ urls                     <chr> NA, "['http://goo.gl/32K38q']", "[]", "…
## $ user_mentions            <chr> "[443196382]", "[]", "[116351908]", "[]…
## $ poll_choices             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

01 Preparing network data

1.1 Extracting RT (Retweet) relations

  • RT relationship:user_screen_name -> RT @someone said something
  • 意即「誰(user_screen_name, 在此為被刪除的帳號們)」RT了@someone(某人,不見得只有被刪除的帳號)
  • 在網絡分析上,通常以from-to來表達一個有向關係(A RT B為有向關係)
  • 將推文時間當作第三個變項tweet_time
# Filtering RT relation (366,253 out of 1,906,831 tweets contain "RT @").
# NOTE(review): this filter matches any "RT @", but the later extraction
# uses the stricter pattern "RT @\\w+"; tweets where "@" is followed by a
# non-word character match here yet yield no edge — verify this is intended.
rt_1 <- tweets_1 %>%
    select(user_screen_name, tweet_text, tweet_time) %>%
    filter(str_detect(tweet_text, "RT @"))

# Edge list for tweets that mention exactly one "RT @user" target:
# pull the single match and strip the leading "RT @" (4 characters).
rt_edge_1 <- rt_1 %>%
    mutate(
        rt_count = str_count(tweet_text, "RT @\\w+"),
        to       = str_extract(tweet_text, "RT @\\w+")
    ) %>%
    filter(rt_count == 1) %>%
    mutate(to = str_sub(to, 5L)) %>%
    select(from = user_screen_name, to, tweet_time)

# Edge list for tweets with two or more "RT @user" mentions: extract every
# target, expand to one row per target, then append the single-target
# edges so rt_edge holds the complete from/to RT edge list.
rt_edge <- rt_1 %>%
    mutate(rt_count = str_count(tweet_text, "RT @\\w+")) %>%
    filter(rt_count > 1) %>%
    mutate(to = str_extract_all(tweet_text, "RT @\\w+")) %>%
    unnest(to) %>%
    mutate(to = str_remove(to, fixed("RT @"))) %>%
    select(from = user_screen_name, to, tweet_time) %>%
    bind_rows(rt_edge_1)

1.2 Extracting inner-group RT relations

# Keep only edges whose RT target is itself one of the suspended accounts,
# i.e. retweets that stay inside the takedown set.
rt_edge_inner <- rt_edge %>%
    filter(is.element(to, users_1$user_screen_name))

02 Network data analysis

2.1 data frame to network

library(igraph)

# Aggregate RT edges into one weighted edge per (from, to, year) and build
# a directed graph. graph_from_data_frame() treats the first two columns
# as from/to; remaining columns (year, weight) become edge attributes.
# (graph_from_data_frame() is the current igraph name; graph.data.frame()
# is the deprecated alias. TRUE spelled out instead of the reassignable T.)
g <- rt_edge_inner %>%
    mutate(year = year(tweet_time)) %>%
    # filter(year >= 2019) %>%
    count(from, to, year) %>%
    select(from, to, year, weight = n) %>%
    graph_from_data_frame(directed = TRUE)

2.2 Inspecting vertices(V) and edges(E)

# Edge attribute "weight": number of RTs aggregated into each edge.
E(g)$weight
##   [1]  1  1  2  1  1  1  1  1  1  1  2  1  1  3  1  5  1  1  1  1  1  1  2
##  [24]  1  1  2  1  1  9  1  4  3  2  3  3  2  1  2  1  1  1  1  1  1  1  1
##  [47]  1  1  1  1  1  5  1  1  1  1  1  1  1  1  1  3  1  1  1  2  3  1  1
##  [70]  1  1  1  1  1  1  1  1  4  1  7  7  3  3  4  2  4  3  1  1  1  2  2
##  [93]  1  1  5  1  1  1  1  1  1  1  1  2  1  2  1  1  1  1  1  1  5  1  1
## [116]  5  6  2  1  1  2  5  1  1  1  1  1  1  1  3  1  2  7  1  1  1  1  1
## [139]  1  1  1  1  6  1  1  1  1  2  1  1  1  1  2  1  1  2 49  2  2  2  1
## [162]  2  1  1  1  1  1  1  1  1  1  2  1  2  2  3  1  1  2  1  1  1  1  5
## [185]  1  1  1  1  2  1  1  1  1  1  3  1  1  1  1  1  1  1  5  1  5  9  2
## [208]  1  1  3  1  1  3  1  2  2  1  2  2  1  1  1  1  1  1  1  2  1  1  1
## [231]  1  1  1  1  2  1  1  1  1  1  2  2  2  1  1  1  1  1  3  1  2  7  1
## [254]  1  1  1  1  1  2  2  3  1  1  1  2  1  1  1  1  1  1  1  1  1  2  2
## [277]  2  2  4  4  4  4  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1
## [300]  1  2  2  1  1  1  1  1  2  1  2  2  4  3  1  5  2  5  5  2  1
## [323]  7  4  1  1  1  1  1  3  1  1  1  2  1  4  1  1  1  1  4  1  2  3  1
## [346]  1  1  1  3  3  1  3  1  1  1  1  1  1  2  1  1  1  1  2  1  3  3  1
## [369]  1  1  1  1  1  1  2  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1  3
## [392] 11  4  1  1  1  1  1  2  2  1  2  1  2  1  1  1  1  1  1  1  1  1  1
## [415]  1  1  2  1  1  1  1  1  2  1  1  1  1  1  1 16  1  1  1  1  1  1  1
# Edge attribute "year": calendar year of the aggregated RTs.
E(g)$year %>% head
## [1] 2018 2019 2018 2017 2017 2017
# Vertices and their names (screen names), for ad-hoc inspection:
# V(g)
# V(g)$name

2.3 Detecting communities

# Detect communities by information flow (Infomap); membership() assigns
# each vertex an integer community id stored as vertex attribute "comm".
# (cluster_infomap() is the current igraph name; infomap.community() is
# the deprecated alias.)
V(g)$comm <- membership(cluster_infomap(g))

# Community sizes (number of vertices per community):
# V(g)$comm
table(V(g)$comm)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  9 57 12 20  4 12 10  3  3 10  6  5  2  7  6  6  5  5  4  3  3  3  3  2  2

2.4 Computing vertices properties

  • for detecting prominent actors, users, or nodes in a graph
# Degree centrality: total number of incident edges (in + out).
V(g)$degree <- degree(g, mode = "all")

# In-degree: edges pointing in — how often this account was retweeted.
V(g)$indegree <- degree(g, mode = "in")

# Out-degree: edges pointing out — how many accounts this node retweeted.
V(g)$outdegree <- degree(g, mode = "out")

# Closeness centrality: based on average shortest-path distance to other
# nodes; $res holds the per-vertex scores.
# (centr_clo() is the current igraph name of the deprecated
# centralization.closeness(); same computation and defaults.)
V(g)$closeness <- centr_clo(g)$res
## Warning (from original run): closeness centrality is not well-defined
## for disconnected graphs — this RT network has multiple components.
# Betweenness centrality: how often a vertex lies on shortest paths
# between other vertices ("bridging" degree).
# (centr_betw() is the current name of centralization.betweenness().)
V(g)$betweenness <- centr_betw(g)$res

2.5 Tabulating node properties

# One row per vertex collecting all computed node-level properties.
nodes_all <- data.frame(name = V(g)$name,
                        degree = V(g)$degree,
                        indegree = V(g)$indegree,
                        outdegree = V(g)$outdegree,
                        closeness = V(g)$closeness,
                        community = V(g)$comm,
                        betweenness = V(g)$betweenness) # fixed column-name typo ("betweeness")

# nodes_all %>% View

03 Network Visualization

3.1 Basic plotting

# Default igraph plot — labels and large arrows make it unreadable here.
plot(g)

# Same graph, smaller arrowheads and no vertex labels.
plot(g, edge.arrow.size = .4, vertex.label = NA)

3.2 Plotting vertices degree distribution

plot(degree.distribution(g, cumulative=T), pch=20, xlab="degree", ylab="cumulative frequency")

3.3 Better network plotting

# Compute a force-directed layout once and reuse it, so repeated plots
# stay comparable. (layout_with_fr() is the current igraph name of the
# deprecated layout.fruchterman.reingold().)
l <- layout_with_fr(g)
# l <- layout_with_kk(g)

plot(g,
     layout = l,
     # vertex.color = rgb(1, 1, 0, 0.2),
     vertex.color = factor(V(g)$comm),          # color nodes by community
     vertex.size  = sqrt(V(g)$degree)*3,        # size scales with sqrt(degree)
     vertex.frame.color= rgb(0, 0, 0, 0.5),
     vertex.label = str_sub(V(g)$name, 1, 10),  # truncate long hashed names
     vertex.label.cex = 0.6,
     vertex.label.color = rgb(0, 0, 0, 0.5),
     edge.curved = 0.1,
     edge.arrow.size = 0.1,
     edge.width = sqrt(E(g)$weight+1),          # width scales with RT volume
     edge.color = E(g)$year,                    # color/label edges by year
     edge.label = E(g)$year,
     edge.label.cex = 0.4,
     edge.label.color = E(g)$year
     )

04 Plotting network only for HK anti-extradition tweets

4.2 Segmenting RT actions before and after 2019-06-05

  • before: 2019-01-01 ~ 2019-06-04
  • after: 2019-06-05 ~
# Label each tweet "before" (through 2019-06-04) or "after" (2019-06-05
# onwards), then keep only retweets.
# Fixed boundary bug: the original `tweet_time > as.Date("2019-06-05")`
# classified a tweet at exactly 2019-06-05 00:00:00 as "before", contrary
# to the stated segmentation (after: 2019-06-05 ~). Comparing on the date
# with >= matches the spec for the whole day.
# NOTE(review): as.Date() on POSIXct uses UTC by default — confirm this
# matches the timezone of tweet_time in the source data.
# (hk_tweets is built in an earlier section not shown in this chunk.)
hk_rt_compare <- hk_tweets %>%
    mutate(date0605 = if_else(as.Date(tweet_time) >= as.Date("2019-06-05"),
                              "after", "before")) %>%
    filter(str_detect(tweet_text, "RT @")) %>%
    select(user_screen_name, tweet_text, date0605)

# Tweet counts per period.
hk_rt_compare %>% count(date0605)

4.3 Segmenting data before and after 2019-06-05

# Build the from/to RT edge list for HK-related tweets: every "RT @user"
# mention becomes one edge, tagged with the before/after period label.
hk_rt_edgelist <- hk_rt_compare %>%
    mutate(rt = str_extract_all(tweet_text, "RT @\\w+")) %>%
    unnest(rt) %>%
    mutate(to = str_remove(rt, fixed("RT @"))) %>%
    # filter(to %in% users_1$user_screen_name) %>%
    select(from = user_screen_name, to, date0605)

4.4 Building network

# Build a directed graph from the edge list: one weighted edge per
# (from, to, period); date0605 and weight become edge attributes.
# (graph_from_data_frame() is the current igraph name; graph.data.frame()
# is the deprecated alias. TRUE spelled out instead of the reassignable T.)
g_hk <- hk_rt_edgelist %>%
    count(from, to, date0605) %>%
    select(from, to, date0605, weight = n) %>%
    graph_from_data_frame(directed = TRUE)

# Examining network properties: Infomap communities.
# (cluster_infomap() is the current name of infomap.community().)
V(g_hk)$comm <- membership(cluster_infomap(g_hk))
table(V(g_hk)$comm)
## 
##   1   2   3   4   5   6   7   8   9 
## 147  12   5   4   3   2   2   2   2
# Examining ego-network properties: degree and centrality measures.
V(g_hk)$degree <- degree(g_hk, mode="all")
V(g_hk)$indegree <- degree(g_hk, mode="in")
V(g_hk)$outdegree <- degree(g_hk, mode="out")
# (centr_clo()/centr_betw() are the current names of the deprecated
# centralization.closeness()/centralization.betweenness().)
V(g_hk)$closeness <- centr_clo(g_hk)$res
## Warning (from original run): closeness centrality is not well-defined
## for disconnected graphs.
V(g_hk)$betweenness <- centr_betw(g_hk)$res
# Flag suspended accounts with a translucent orange fill; everything else
# (targets outside the takedown set) gets translucent grey.
V(g_hk)$deleted <- if_else(V(g_hk)$name %in% users_1$user_screen_name,
                           rgb(1, 0.25, 0, 0.3), rgb(0, 0, 0, 0.1))

# Saving network properties into a table. tibble() replaces the deprecated
# data_frame(), which triggered a deprecation warning in the original run.
nodes_hk_rt <- tibble(name = V(g_hk)$name,
                      degree = V(g_hk)$degree,
                      indegree = V(g_hk)$indegree,
                      outdegree = V(g_hk)$outdegree,
                      closeness = V(g_hk)$closeness,
                      community = V(g_hk)$comm,
                      betweenness = V(g_hk)$betweenness, # fixed column-name typo ("betweeness")
                      deleted = V(g_hk)$deleted)

4.5 Plotting network

# Plotting network: default, then without labels and with small arrows.
plot(g_hk)

plot(g_hk, edge.arrow.size = .4, vertex.label = NA)

# Cumulative degree distribution of the HK RT network.
# (degree_distribution() is the current igraph name of the deprecated
# degree.distribution(); TRUE spelled out instead of the reassignable T.)
plot(degree_distribution(g_hk, cumulative = TRUE), pch = 20,
     xlab = "degree", ylab = "cumulative frequency")

4.6 Better plotting

# Better network plotting. The grid Fruchterman-Reingold layout was
# removed from igraph — the original call warned and silently fell back to
# plain Fruchterman-Reingold — so call layout_with_fr() directly.
l <- layout_with_fr(g_hk)
plot(g_hk,
     layout = l,
     vertex.color = V(g_hk)$deleted,            # orange = suspended account
     vertex.size  = sqrt(V(g_hk)$degree)*2,     # size scales with sqrt(degree)
     vertex.frame.color= rgb(0, 0, 0, 0.5),
     vertex.label = str_sub(V(g_hk)$name, 1, 10), # truncate long hashed names
     vertex.label.cex = 0.6,
     vertex.label.color = rgb(0, 0, 0, 0.5),
     edge.curved = .1,
     edge.arrow.size = .1,
     edge.width = E(g_hk)$weight/2,             # width scales with RT volume
     edge.color = factor(E(g_hk)$date0605)      # color edges by before/after
     )
     )